*****************************************************************************
* ALL COMPANIES THAT BUILD OR BUY WITH INTERNAL HUMAN CAPITAL (HC)
*****************************************************************************
clear
use "${input_stata}\marche_f_HC_t0_nbheur.dta" // Load initial dataset

* Assemble the working file through successive one-to-one merges on
* (sirtg, year, code). No keep() option is given, so unmatched observations
* from either side are retained; nogenerate suppresses the _merge indicator
* instead of creating it and dropping it afterwards.

* Human-capital measures at t0 (nombre, s_brut) ...
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t0_nombre", nogenerate
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t0_s_brut", nogenerate

* ... and at t1 (nbheur, nombre, s_brut)
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t1_nbheur", nogenerate
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t1_nombre", nogenerate
merge 1:1 sirtg year code using "${input_stata}\marche_f_HC_t1_s_brut", nogenerate

* Sector-level HC file and distance files
merge 1:1 sirtg year code using "${input_stata}\HC_secteur", nogenerate
merge 1:1 sirtg year code using "${input_stata}\distance_within_nombre", nogenerate
merge 1:1 sirtg year code using "${input_stata}\distance_bvrs", nogenerate
* Renamed immediately after this merge, before any further file is merged in
rename distance distance_bvrs

merge 1:1 sirtg year code using "${input_stata}\inv_final.dta", nogenerate
merge 1:1 sirtg year code using "${input_stata}\HC_alternative.dta", nogenerate

* This file is keyed on (sirtg, year) only, hence many-to-one
merge m:1 sirtg year using "${input_stata}\HC_own_sector_t1_nombre", nogenerate

* The sector code becomes the entry-sector identifier, stored as a string
rename code code_entry
tostring code_entry, replace

* Left-pad with "0": each pass prepends one zero to values shorter than
* 3 characters, so two passes bring 1- and 2-character codes to length 3.
forvalues pass = 1/2 {
    replace code_entry = "0" + code_entry if length(code_entry) < 3
}

*****************************************************************************
* RESTRICTING TO COMPANIES IN THE MATCHED GROUP
*****************************************************************************
* NOTE(review): despite the section title, nothing is dropped here. Both
* merges run without keep(), so unmatched observations from the master and
* from the list files all remain in memory. Confirm whether the restriction
* is meant to be applied later or whether a keep() is missing.

* Build list: plain one-to-one merge on (sirtg, year, code_entry)
merge 1:1 sirtg year code_entry using "${input_stata}\list_build.dta", nogenerate

* Buy list: the update option fills master values that are missing with the
* values found in the using file; non-missing master values are left as is
merge 1:1 sirtg year code_entry using "${input_stata}\list_buy.dta", update nogenerate

*****************************************************************************
* CONTROL VARIABLES
*   t0 controls: merged on sirtg only (one record per sirtg)
*   t1 controls: merged on sirtg and year
*****************************************************************************
merge m:1 sirtg using "${input_stata}\ctrl_var_t0.dta", nogenerate
merge m:1 sirtg year using "${input_stata}\ctrl_var_t1.dta", nogenerate

* Build the same six ratios for each period suffix (t0 first, then t1, so the
* variables are created in the same order as when written out one by one).
* NOTE(review): sal_eff multiplies by 1000 whereas the other per-employee
* ratios divide by 1000 - presumably a difference in source units; confirm.
foreach p in t0 t1 {
    gen vaj_eff_`p'     = vaj_`p' / (eff_3112_`p' * 1000)      // Value-added per employee
    gen diversity_`p'   = nb_pcs_`p' / eff_3112_`p'            // Occupations per employee
    gen size_`p'        = log(eff_3112_`p')                    // Log of firm size
    gen immo_eff_`p'    = immocorp_`p' / (eff_3112_`p' * 1000) // Tangible assets per employee
    gen sal_eff_`p'     = (sum_s_brut_`p' / eff_3112_`p') * 1000 // Wages per employee
    gen tresact_eff_`p' = tresact_`p' / (eff_3112_`p' * 1000)  // Cash per employee
}

* Group identifier for each (entry code, apgr_1, type_sales_1) combination
egen orig_dest_size = group(code_entry apgr_1 type_sales_1)

* Entry-mode indicators derived from the string variable entree.
* Each dummy is left missing when entree is empty.
tab entree
gen buy         = (entree == "external")    if !missing(entree) // External entry
gen int_newfirm = (entree == "int_newfirm") if !missing(entree) // Internal entry (new firm)
gen int_oldfirm = (entree == "int_const")   if !missing(entree) // Internal entry (existing firm)

* Numeric copies of the two string codes; force turns any value that is not
* a number into missing instead of stopping with an error
destring apgr_1 code_entry, generate(apgr_1_num code_entry_num) force

***************************
* LISTED COMPANIES
***************************
* Attach the listed-group flag. Observations found only in the using file
* are not kept; master observations without a match get listed_gr = 0.
merge m:1 sirtg year using "${output_stata}\listed_group.dta", keep(master match)
replace listed_gr = 0 if _merge == 1
drop _merge

*************************
* ENTRY AND EXIT DISTANCES
*************************
* same1 / same2: 1 when apgr_1 and code_entry share their first 1 (resp. 2)
* characters, 0 otherwise
forvalues d = 1/2 {
    gen same`d' = substr(apgr_1, 1, `d') == substr(code_entry, 1, `d')
}

* Two-character prefixes of both codes, converted to numeric; they serve as
* coarser merge keys for the input-output table (force: non-numeric -> missing)
gen apgr_1_2 = substr(apgr_1, 1, 2)
gen code_2   = substr(code_entry, 1, 2)
destring apgr_1_2 code_2, replace force

**** UPSTREAM RELATIONSHIPS ****
* The input-output table is keyed on (from, to). For the upstream link the
* entry code plays the role of "from" and the apgr_1 code the role of "to".
* Four key combinations are tried in a fixed order:
*   1. apgr_1_num / code_entry_num
*   2. apgr_1_2   / code_entry_num
*   3. apgr_1_num / code_2
*   4. apgr_1_2   / code_2
* The first pass is a plain merge; later passes use the update option, which
* only fills values still missing, so earlier matches take precedence.
* The keys are renamed to (to, from) for the merge and restored right after.
local io_opts ""
foreach dest in code_entry_num code_2 {
    foreach orig in apgr_1_num apgr_1_2 {
        rename (`orig' `dest') (to from)
        merge m:1 from to using "${input_r}\sources\Input_Output\io_2017.dta" `io_opts'
        drop if _merge == 2 // Discard table rows with no counterpart in the master
        drop _merge
        rename (to from) (`orig' `dest')
        local io_opts ", update"
    }
}

* Free the name "share" for the downstream merges that follow
rename share link_upstream

**** DOWNSTREAM RELATIONSHIPS ****
* Same procedure as for the upstream link with the roles swapped: the apgr_1
* code is "from" and the entry code is "to". Key combinations, in order:
*   1. apgr_1_num / code_entry_num
*   2. apgr_1_2   / code_entry_num
*   3. apgr_1_num / code_2
*   4. apgr_1_2   / code_2
* Only the first pass is a plain merge; the others use update and therefore
* only fill values that are still missing.
local io_opts ""
foreach dest in code_entry_num code_2 {
    foreach orig in apgr_1_num apgr_1_2 {
        rename (`orig' `dest') (from to)
        merge m:1 from to using "${input_r}\sources\Input_Output\io_2017.dta" `io_opts'
        drop if _merge == 2 // Discard table rows with no counterpart in the master
        drop _merge
        rename (from to) (`orig' `dest')
        local io_opts ", update"
    }
}

rename share link_downstream // Rename link variable

************************
* ENTRY INTO NEW GEOGRAPHIC ZONES
************************
* Merge the two geographic entry files (dep, then reg) on
* (sirtg, code_entry, year); rows present only in the using file are not kept
foreach level in dep reg {
    merge m:1 sirtg code_entry year using "${input_stata}\entry_geo_`level'.dta", keep(master match) nogenerate
}

*************************
* CREATION OF NEW FIRMS & CROSS-SECTION SECTOR DATA
*************************
* Firm-creation data, keyed on the string entry code and year.
* Rows found only in the using file are not kept.
merge m:1 code_entry year using "${input_stata}\creation.dta", keep(master match) nogenerate

* The cross-section sector file is keyed on a numeric code: build a temporary
* numeric copy of code_entry, merge on it, then remove it again
destring code_entry, generate(code) force
merge m:1 code year using "${input_stata}\cross_section_secteur.dta", keep(master match) nogenerate
drop code

*************************
* VARIABLES
*************************
* (1) Growth rate of nombre relative to its lag l1_nombre
gen g_creation = (nombre - l1_nombre) / l1_nombre
summarize nombre g_creation, detail

* (2) Growth rate of ca relative to its lag l1_ca
gen g_secteur_sales = (ca - l1_ca) / l1_ca
summarize ca g_secteur_sales, detail

* (3) Competition (Herfindahl index) - descriptive statistics only
summarize herfindahl, detail

* (4) Sector risk/uncertainty variables - descriptive statistics only
summarize sd_time_growth_agg_ca sd_growth_siren_ca

* (5) Winsorize both growth rates at the 1st and 99th percentiles, in place
winsor2 g_secteur_sales g_creation, cut(1 99) replace

* Numeric copy of the sirtg identifier (force: non-numeric -> missing)
destring sirtg, generate(sirtg_num) force

* Entry sales relative to l1_cagr
summarize sales l1_cagr, detail
gen share = sales / l1_cagr
summarize share, detail

* Human-capital distance = 1 - overlap, for each measure and each period.
* These are computed from the overlap variables as they stand here, i.e.
* before the winsorization below.
* NOTE(review): the HC_distance_* variables themselves are never winsorized,
* and neither are the t0 controls - confirm this is intended.
foreach m in nbheur s_brut nombre {
    foreach p in t0 t1 {
        gen HC_distance_`m'_`p' = 1 - HC_overlap_`p'_`m'
    }
}

gen HC_distance_secteur = 1 - HC_secteur

* Winsorize the overlap measures, the t1 controls, sales and share (1/99), in place
winsor2 HC_overlap* vaj_eff_t1 diversity_t1 size_t1 immo_eff_t1 sal_eff_t1 tresact_eff_t1 sales share, ///
    cut(1 99) replace

*************************
* LABELING VARIABLES
*************************
* Labels are LaTeX fragments.
* NOTE(review): the indicator labels in this section write \mathbb{1}
* without surrounding math delimiters, whereas the upstream/downstream
* dummy labels further down wrap it in math mode. Presumably whatever
* exports these labels copes with both forms - confirm.

* Entry-mode indicators
label variable buy "\mathbb{1}(Buy)$\_{\textit{g,n,t}}$"
label variable int_newfirm "\mathbb{1}(Build new firm)$\_{\textit{g,n,t}}$"
label variable int_oldfirm "\mathbb{1}(Build old firm)$\_{\textit{g,n,t}}$"

* Human-capital distance, t-1 versions
label variable HC_distance_nombre_t1 "HC Distance$\_{\textit{g,n,t-1}}$ ($\#$ workers)"
label variable HC_distance_s_brut_t1 "HC Distance$\_{\textit{g,n,t-1}}$"
label variable HC_distance_nbheur_t1 "HC Distance$\_{\textit{g,n,t-1}}$ (hours)"

* Human-capital distance, t0 versions
label variable HC_distance_nombre_t0 "HC Distance$\_{\textit{g,n,t0}}$ ($\#$ workers)"
label variable HC_distance_nbheur_t0 "HC Distance$\_{\textit{g,n,t0}}$ (hours)"
label variable HC_distance_s_brut_t0 "HC Distance$\_{\textit{g,n,t0}}$"

* Sector-level human-capital distance
label variable HC_distance_secteur "HC Distance$\_{\textit{g,n,t-1}}^{Sector}$" // will be modified

* Controls, t0 versions
label variable vaj_eff_t0 "Value added/N. Employees$\_{\textit{g,t0}}$"
label variable diversity_t0 "N. Occupations/N. Employees$\_{\textit{g,t0}}$"
label variable size_t0 "log(N. Employees)$\_{\textit{g,t0}}$"
label variable immo_eff_t0 "Tangible Assets/N. Employees$\_{\textit{g,t0}}$"
label variable sal_eff_t0 "Total wages/N. Employees$\_{\textit{g,t0}}$"
label variable tresact_eff_t0 "Cash/N. Employees$\_{\textit{g,t0}}$"

* Controls, t-1 versions
label variable vaj_eff_t1 "Value added/N. Employees$\_{\textit{g,t-1}}$"
label variable diversity_t1 "N. Occupations/N. Employees$\_{\textit{g,t-1}}$"
label variable size_t1 "log(N. Employees)$\_{\textit{g,t-1}}$"
label variable immo_eff_t1 "Tangible Assets/N. Employees$\_{\textit{g,t-1}}$"
label variable sal_eff_t1 "Total wages/N. Employees$\_{\textit{g,t-1}}$"
label variable tresact_eff_t1 "Cash/N. Employees$\_{\textit{g,t-1}}$"

* Entry size. sales is divided by 1000 here, before it is labelled.
label variable share "Entry sales/Lagged total sales$\_{\textit{g,n,t}}$"
replace sales = sales / 1000
label variable sales "Entry sales (M\euro{})$\_{\textit{g,n,t}}$"

* Listing status and industry proximity
label variable listed_gr "\mathbb{1}(Public)$\_{\textit{g,t-1}}$"
label variable same1 "\mathbb{1}(Same 1-digit Industry)$\_{\textit{o,n}}$"
label variable same2 "\mathbb{1}(Same 2-digit Industry)$\_{\textit{o,n}}$"

* Threshold dummies for the input-output links: <dir>NN equals 1 when
* link_<dir> exceeds 0.NN (1%, 5%, 10%, 20%) and 0 otherwise.
*
* Stata treats a numeric missing value as larger than any number, so an
* unguarded "link > c" evaluates to 1 whenever the link is missing (which
* happens for every pair that found no match in the input-output table).
* The !missing() guard leaves the dummy missing in that case instead of
* silently classifying an unknown link as a strong one. This mirrors how
* the entry-mode dummies are guarded with "if entree != """.
foreach dir in upstream downstream {
    foreach t in 01 05 10 20 {
        gen `dir'`t' = link_`dir' > 0.`t' if !missing(link_`dir')
    }
}

label var link_upstream "Upstream link$\_{\textit{o,n}}$"
label var upstream05 "$\mathbb{1}$(Upstream link $>$5\%)$\_{\textit{o,n}}$" 
label var upstream01 "$\mathbb{1}$(Upstream link $>$1\%)$\_{\textit{o,n}}$" 

label var link_downstream "Downstream link$\_{\textit{o,n}}$"
label var downstream05 "$\mathbb{1}$(Downstream link $>$5\%)$\_{\textit{o,n}}$" 
label var downstream01 "$\mathbb{1}$(Downstream link $>$1\%)$\_{\textit{o,n}}$" 
label var distance_bvrs "Product market distance$\_{\textit{g,n,t-1}}$"

* Labels for the geographic entry indicators and the firm-creation count
label var entree_dep "\mathbb{1}(New department)$\_{\textit{g,t}}$" 
label var entree_reg "\mathbb{1}(New region)$\_{\textit{g,t}}$"
label variable nombre "N. New firms$\_{\textit{n,t-1}}$" 

* Rebuild the diversification dummy from nb_secteur: 1 when nb_secteur is
* above 1, 0 otherwise. The existing diversifie variable is discarded first.
* Stata ranks a numeric missing value above every number, so without the
* !missing() guard an observation with unknown nb_secteur would be coded
* as diversified; with the guard it stays missing.
drop diversifie
gen diversifie = nb_secteur > 1 if !missing(nb_secteur)
label variable diversifie "\mathbb{1}(Diversified)$\_{\textit{g,t-1}}$"
label variable nb_secteur "N. Sectors$\_{\textit{g,t-1}}$"

* Quick checks
summ nb_secteur diversifie
summ share sales


******************
* FIXED EFFECTS
******************
* Each fixed-effect identifier is the group() index of a list of variables.
* The lists are declared first (one local per identifier, named fe_<id>) and
* the identifiers are then created in a single loop. The variable order
* inside each list is kept as is because it determines the group numbering.
local fe_orig_size                "type_sales_1 apgr_1"
local fe_orig_dest                "code_entry apgr_1"
local fe_orig_dest_year           "code_entry apgr_1 year"
local fe_orig1_orig2_dest         "code_entry apgr_1 apgr_2"
local fe_orig_labprod_growth_dest "code_entry apgr_1 type_labprod type_growth"

foreach fe in orig_size orig_dest orig_dest_year orig1_orig2_dest orig_labprod_growth_dest {
    egen `fe' = group(`fe_`fe'')
}

* Write the final regression dataset, overwriting any previous version
save "${output_stata}\main_regression.dta", replace
